{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['paws-train.json',\n",
       " 'paws-dev.json',\n",
       " 'paws-test.json',\n",
       " 'mrpc-train.json',\n",
       " 'parasci-arxiv-train.json',\n",
       " 'parasci-arxiv-val.json',\n",
       " 'parasci-test.json',\n",
       " 'parasci-train.json',\n",
       " 'parasci-val.json']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from glob import glob\n",
    "\n",
    "parasci = sorted(glob('parasci-*.json'))\n",
    "parasci = [f for f in parasci if 'parasci-arxiv-test' not in f]\n",
    "files = ['paws-train.json', 'paws-dev.json', 'paws-test.json', 'mrpc-train.json'] + parasci\n",
    "files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['funpedia-train.json', 'funpedia-test.json', 'funpedia-valid.json']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "funpedia = glob('funpedia*')\n",
    "funpedia"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(funpedia[0]) as fopen:\n",
    "    data = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(81467, 51585)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rejected = {'i', 'you', 'she', 'her', 'he', 'them'}\n",
    "\n",
    "selected = []\n",
    "for d in data:\n",
    "    if len(set(d['label'].lower().split()) & rejected):\n",
    "        continue\n",
    "        \n",
    "    if len(set(d['passage'].lower().split()) & rejected):\n",
    "        continue\n",
    "    \n",
    "    selected.append(d)\n",
    "    \n",
    "len(data), len(selected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'title': 'Soldier Mountain',\n",
       "  'label': 'What a nice place to go above sea level, Soldier Mountain is',\n",
       "  'passage': 'Located north of Fairfield in very rural Camas County, its summit elevation is above sea level with a vertical drop of .',\n",
       "  'persona': 'Caring',\n",
       "  'ms': ['Tempat yang bagus untuk naik ke atas permukaan laut, Soldier Mountain adalah',\n",
       "   'Terletak di utara Fairfield di Camas County yang sangat luar bandar, ketinggian puncaknya berada di atas paras laut dengan penurunan menegak.']},\n",
       " {'title': 'Tyssøy',\n",
       "  'label': 'Incredible, the island Tyssoy is all away towards the north in the country',\n",
       "  'passage': 'The island lies in the northern part of the Raunefjorden.',\n",
       "  'persona': 'Profound',\n",
       "  'ms': ['Luar biasa, pulau Tyssoy terletak di utara di negara ini',\n",
       "   'Pulau ini terletak di bahagian utara Raunefjorden.']},\n",
       " {'title': 'Byfjorden (Rogaland)',\n",
       "  'label': 'It would be so liberating to live in a fjord like Byfjorden in Norway.',\n",
       "  'passage': 'Byfjorden is a fjord in Rogaland county, Norway.',\n",
       "  'persona': 'Freethinking',\n",
       "  'ms': ['Ia akan menjadi sangat membebaskan untuk tinggal di fjord seperti Byfjorden di Norway.',\n",
       "   'Byfjorden merupakan sebuah fjord di daerah Rogaland, Norway.']},\n",
       " {'title': 'Max Landis',\n",
       "  'label': 'Max Landis is a comic book writer who wrote Chronicle, American Ultra, and Victor Frankestein.',\n",
       "  'passage': 'Max Landis (born August 3, 1985) is an American screenwriter, director, producer, and comic book writer who wrote the films \"Chronicle\" (2012), \"American Ultra\" (2015), \"Victor Frankenstein\" (2015), and \"Bright\" (2017), as well as a variety of short films including \"The Death and Return of Superman\" and \"Wrestling Isn\\'t Wrestling\".',\n",
       "  'persona': 'Humorous',\n",
       "  'ms': ['Max Landis adalah penulis buku komik yang menulis Chronicle, American Ultra, dan Victor Frankestein.',\n",
       "   'Max Landis (lahir 3 Ogos 1985) ialah seorang penulis skrip, pengarah, penerbit, dan penulis buku komik Amerika yang menulis filem-filem \"Chronicle\" (2012), \"American Ultra\" (2015), \"Victor Frankenstein\" (2015), dan \"Bright\" (2017), serta pelbagai filem pendek termasuk \"The Death and Return of Superman\" dan \"Wrestling Isn\\'t Wrestling\".']},\n",
       " {'title': 'Elmer Ernest Southard',\n",
       "  'label': 'Elmer Ernset Southard was a neuropsychiatrist, neuropathologist, professor and author who was born in the United States',\n",
       "  'passage': 'Elmer Ernest Southard (July 28, 1876February 8, 1920) was an American neuropsychiatrist, neuropathologist, professor and author.',\n",
       "  'persona': 'Idealistic',\n",
       "  'ms': ['Elmer Ernset Southard adalah ahli neuropsikologi, neuropatologi, profesor dan pengarang yang dilahirkan di Amerika Syarikat.',\n",
       "   'Elmer Ernest Southard (28 Julai 1876 8 Februari 1920) adalah seorang ahli neuropsikologi, neuropatologi, profesor dan pengarang Amerika.']},\n",
       " {'title': 'Tribes of Caïn',\n",
       "  'label': 'Tribes of Cain is a band from swtizerland who released 3 albums',\n",
       "  'passage': 'Tribes of Caïn is a heavy metal band from Switzerland, formed in 1999, with three albums released.',\n",
       "  'persona': 'Spirited',\n",
       "  'ms': ['Tribes of Cain adalah kumpulan dari swtizerland yang mengeluarkan 3 album',\n",
       "   'Tribes of Can adalah kumpulan heavy metal dari Switzerland, yang dibentuk pada tahun 1999, dengan tiga album dikeluarkan.']},\n",
       " {'title': 'Félix Stevens',\n",
       "  'label': 'Félix stevens is a retired cuban sprinter who competed in the 80s.',\n",
       "  'passage': 'Félix Stevens (born March 8, 1964) is a retired male sprinter from Cuba, who competed in the 1980s and the early 1990s for his native country.',\n",
       "  'persona': 'Confident',\n",
       "  'ms': ['Flix stevens adalah pelari pecut Cuba yang bersara yang bertanding pada tahun 80-an.',\n",
       "   'Flix Stevens (lahir 8 Mac 1964) adalah seorang pelari pecut lelaki bersara dari Cuba, yang bertanding pada tahun 1980-an dan awal 1990-an untuk negara asalnya.']},\n",
       " {'title': 'Bayside, Wisconsin',\n",
       "  'label': 'Bayside has a population of 4389 in the 2010 census',\n",
       "  'passage': 'Bayside is a village in Milwaukee and Ozaukee counties in the U.S. state of Wisconsin.',\n",
       "  'persona': 'Caring',\n",
       "  'ms': ['Bayside mempunyai penduduk 4389 pada banci 2010',\n",
       "   'Bayside adalah sebuah kampung di Milwaukee dan daerah Ozaukee di negeri Wisconsin A.S..']},\n",
       " {'title': 'Arthur Engelbert',\n",
       "  'label': 'Arthur Engelbert is a professor at the University of Applied Sciences',\n",
       "  'passage': 'Arthur Engelbert (born 1951 in Werdohl) is professor in media theory and art sciences at the University of Applied Sciences Potsdam.',\n",
       "  'persona': 'Creative',\n",
       "  'ms': ['Arthur Engelbert adalah profesor di Universiti Sains Gunaan',\n",
       "   'Arthur Engelbert (lahir 1951 di Werdohl) ialah profesor dalam teori media dan sains seni di Universiti Sains Gunaan Potsdam.']},\n",
       " {'title': 'Barron (horse)',\n",
       "  'label': 'Barron is a horse who specialises in show jumping',\n",
       "  'passage': 'Barron is a show jumping horse.',\n",
       "  'persona': 'Honest',\n",
       "  'ms': ['Barron adalah kuda yang pakar dalam pertunjukan melompat',\n",
       "   'Barron adalah kuda melompat pertunjukan.']}]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "selected[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████| 21829/21829 [00:00<00:00, 200491.52it/s]\n",
      "100%|███████████████████████████████████| 3539/3539 [00:00<00:00, 195352.20it/s]\n",
      "100%|███████████████████████████████████| 3536/3536 [00:00<00:00, 188555.98it/s]\n",
      "100%|███████████████████████████████████| 2474/2474 [00:00<00:00, 173482.93it/s]\n",
      "100%|███████████████████████████████| 309834/309834 [00:01<00:00, 184177.08it/s]\n",
      "100%|███████████████████████████████████| 3680/3680 [00:00<00:00, 176911.97it/s]\n",
      "100%|███████████████████████████████████| 2345/2345 [00:00<00:00, 170686.57it/s]\n",
      "100%|█████████████████████████████████| 28883/28883 [00:00<00:00, 175805.80it/s]\n",
      "100%|███████████████████████████████████| 2753/2753 [00:00<00:00, 177189.67it/s]\n"
     ]
    }
   ],
   "source": [
    "with open('train.json', 'w') as fopen_jsonl:\n",
    "    \n",
    "    for f in files:\n",
    "        with open(f) as fopen:\n",
    "            data = json.load(fopen)\n",
    "    \n",
    "        for i in tqdm(range(len(data))):\n",
    "            ms = data[i]['ms']\n",
    "            if len(ms) != 2:\n",
    "                continue\n",
    "\n",
    "            left = ms[0]\n",
    "            right = ms[1]\n",
    "\n",
    "            if left == right:\n",
    "                continue\n",
    "\n",
    "            if len(left) and len(right) and len(left.split()) < 256 and len(right.split()) < 256:\n",
    "                d = {\"translation\": {\"src\": left, \"tgt\": right, 'prefix': 'parafrasa: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "                d = {\"translation\": {\"src\": right, \"tgt\": left, 'prefix': 'parafrasa: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "745890 train.json\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l train.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|█████████████████████████████████████| 279/279 [00:00<00:00, 161675.99it/s]\n",
      "100%|███████████████████████████████████| 2549/2549 [00:00<00:00, 174269.85it/s]\n"
     ]
    }
   ],
   "source": [
    "with open('test.json', 'w') as fopen_jsonl:\n",
    "    \n",
    "    files = ['mrpc-val.json', 'parasci-arxiv-test.json']\n",
    "    for f in files:\n",
    "        with open(f) as fopen:\n",
    "            data = json.load(fopen)\n",
    "        for i in tqdm(range(len(data))):\n",
    "            ms = data[i]['ms']\n",
    "            if len(ms) != 2:\n",
    "                continue\n",
    "\n",
    "            left = ms[0]\n",
    "            right = ms[1]\n",
    "\n",
    "            if left == right:\n",
    "                continue\n",
    "\n",
    "            if len(left) and len(right) and len(left.split()) < 256 and len(right.split()) < 256:\n",
    "                d = {\"translation\": {\"src\": left, \"tgt\": right, 'prefix': 'parafrasa: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')\n",
    "                d = {\"translation\": {\"src\": right, \"tgt\": left, 'prefix': 'parafrasa: '}}\n",
    "                fopen_jsonl.write(f'{json.dumps(d)}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5544 test.json\r\n"
     ]
    }
   ],
   "source": [
    "!wc -l test.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "!shuf train.json > shuffled-train.json\n",
    "!shuf test.json > shuffled-test.json"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
